/*********************************************************************
Purpose: 	Merge and prepare phone survey and baseline covariates for analysis.
*********************************************************************/

	set more off
	clear matrix
	clear all
	*ssc install estout
	
*------------------------------------------------------------------------
	use "$PHONE_DATA_OUT/Phone Survey All Covariates (Plant)_Analysis.dta", clear

	
*****************************************************************************
****** Create Factors of Production from Phone Survey (Gargi's Table 5) *****
*****************************************************************************
* note: this code is all copied from Gargi's code to produce her Aggregated Factors of Production balance check table

	keep if surveyed == 1 
	label var bh_annu_water_cost "BH Annual Water Cost (INR 000's)"
	replace bh_annu_water_cost = bh_annu_water_cost/1000
	label var ph_tot_water_cost  "PH Annual Water Cost (INR 000's)"
	replace ph_tot_water_cost = ph_tot_water_cost/1000*annu_work_days
	label var ph_tot_rawmtrl_cost "Annual Fabric Cost (INR Lakhs)"
	label var ph_tot_chem_cost "Annual PH Chemical Cost (INR 000's)"
	replace ph_tot_chem_cost = ph_tot_chem_cost*100000/1000*annu_work_days
	label var tot_chem_cost_etp "Annual ETP Chemical Cost (INR 000's)"
	replace tot_chem_cost_etp = tot_chem_cost_etp /1000*annu_work_days
	label var bh_annu_chem_cost  "Annual BH Chemical Cost (INR 000's)"
	replace bh_annu_chem_cost = bh_annu_chem_cost /1000
	label var bh_annu_ope_cost_lakh "Annual BH Operating Cost (INR Lakhs)"
	label var bh_annu_maint_cost_lakh "Annual BH Maitnenance Cost (INR Lakhs)"
	label var bh_alt_annu_labor_cost "Alt: BH Annual Labour Cost (INR Lakhs)"
	label var bh_annu_labor_cost  "BH Annual Worker Cost (INR Lakhs)"
	label var bh_annu_elec_cost "BH Electricity Cost"
	label var bh_annu_inputs_cost "BH Inputs Cost"
	label var treat "ETS Treatment=1"
	
//	Generate key aggregate non-BH variables // plant-level variables	
	
	* Total Revenue
		rename tot_rev_19_1 plant_rev
	
	* Capital = bh annualized installation, modification; bh repair, operating, maint
		
		* fixed capital
		gen pv_fixed_capital = pv_instllation_cost + pv_modification_cost
		label var pv_fixed_capital  "Total Fixed Capital (INR Lakhs)"

		* working capital
		* set missing values to 0
		foreach var of varlist bh_total_smallrepair bh_annu_ope_cost_lakh bh_annu_maint_cost_lakh {
			replace `var' = 0 if `var' == .
		}
		
		* create aggregate variable
		gen working_capital = bh_total_smallrepair + bh_annu_ope_cost_lakh + bh_annu_maint_cost_lakh 
		label var working_capital  "Working Capital (INR Lakhs)"
		
		* replace 0 values with missing again
		foreach var of varlist bh_total_smallrepair bh_annu_ope_cost_lakh bh_annu_maint_cost_lakh {
			replace `var' = . if `var' == 0
		}
	
		* total capital (= leave missing as missing; most are fixed capital)
		gen plant_cost_capital = working_capital + pv_fixed_capital
		label var plant_cost_capital  "Total Capital (INR Lakhs)"

	* Labor Costs = office, prod + bh daily, engineer, master 
		
		* create aggregate variable		// can do the c6_ etc. computations if we want 
		generate plant_cost_labor = tot_office_worker_cost * 12 + tot_prod_worker_cost * 12 + bh_annu_labor_cost + bh_engineer_cost + bh_an_master_cost
		label var plant_cost_labor  "Total Labor Cost (INR Lakhs)"
	
	* Fuel Costs = fuel
		gen plant_cost_fuel = total_fuel_cost_1920
		label var plant_cost_fuel  "Total Fuel Cost (INR Lakhs)"
		
		* also create the bl from previous year
		gen plant_cost_fuel_bl = total_fuel_cost_1819
		label var plant_cost_fuel_bl  "Total Fuel Cost (INR Lakhs) - Baseline"
	
	* Electricity Cost = elec
		gen plant_cost_elec = elecbill_2019
		label var plant_cost_elec  "Total Electricity Cost (INR Lakhs)"

	* Material Costs = raw material, water, chem
	
		* set missing values to 0
		foreach var of varlist ph_tot_rawmtrl_cost ph_tot_water_cost bh_annu_water_cost ///
		ph_tot_chem_cost tot_chem_cost_etp bh_annu_chem_cost {
			tab `var'
			replace `var' = 0 if `var' == .
		}
		
		* truncate high values
		tabstat ph_tot_chem_cost , stat(mean p50) save
		matrix temp = r(StatTotal)
		matrix list temp
		local impute temp[2,1]
		di `impute'
		replace ph_tot_chem_cost = `impute' if ph_tot_chem_cost > (10000*annu_work_days) & ph_tot_chem_cost != .
		
		* create aggregate variable
		gen plant_cost_material = ph_tot_rawmtrl_cost + ph_tot_water_cost/100 + bh_annu_water_cost/100 + ///
			ph_tot_chem_cost/100 + tot_chem_cost_etp/100 + bh_annu_chem_cost/100
		label var plant_cost_material "Total Material Cost (INR Lakhs)"
		
		* replace original values back to normal
		*foreach var of varlist ph_tot_rawmtrl_cost ph_tot_water_cost bh_annu_water_cost ///
		*ph_tot_chem_cost tot_chem_cost_etp bh_annu_chem_cost {
		*	replace `var' = . if `var' == 0
		*}
		
//	Rename relevant BH costs 

	gen bh_cost_capital = plant_cost_capital
	label var bh_cost_capital  "BH Capital (INR Lakhs)"
	
	gen bh_cost_labor = bh_annu_labor_cost + bh_engineer_cost + bh_an_master_cost
	label var bh_cost_labor  "BH Labor Cost (INR Lakhs)"
	
	gen bh_cost_elec = bh_annu_elec_cost
	label var bh_cost_elec  "BH Electricity Cost (INR Lakhs)"
	
	gen bh_cost_fuel = total_fuel_cost_1920
	label var bh_cost_fuel  "BH Fuel Cost (INR Lakhs)"
	
	gen bh_cost_fuel_bl = total_fuel_cost_1819
	label var bh_cost_fuel_bl  "BH Fuel Cost (INR Lakhs) - Baseline"
	
	gen bh_cost_material = bh_annu_water_cost/100 + bh_annu_chem_cost/100
	label var bh_cost_material "BH Material Cost (INR Lakhs)"
	
//	Edit plant costs to non-BH costs; except elec (we will report only plant)
	replace plant_cost_capital = plant_cost_capital - bh_cost_capital
	replace plant_cost_labor = plant_cost_labor - bh_cost_labor
	replace plant_cost_material = plant_cost_material - bh_cost_material 
	replace plant_cost_fuel = plant_cost_fuel - bh_cost_fuel
	
//	Keep relevant variables, rename to signal endline
	keep industry_id plant_cost_* bh_cost_* num_cyclones num_bagfilters num_scrubbers num_esps D_cyc D_bf D_scr D_esp
	
	ds industry_id *_bl, not
	foreach var of varlist `r(varlist)' {
		rename `var' `var'_el
	}
	
//	Save to temp file
	tempfile data
	save `data', replace
					
**********************************************************************
****** 		Merge Factors of Production with Baseline Values	 *****
**********************************************************************

use "$BASELINE_DATA_IN/BaselineCovariates_318i.dta", clear

	label var bh_annu_ope_cost_lakh "Annual BH Operating Cost (INR Lakhs)"
	label var bh_annu_maint_cost_lakh "Annual BH Maintenance Cost (INR Lakhs)"
	label var bh_annu_labor_cost  "BH Annual Worker Cost (INR Lakhs)"
	label var bh_annu_elec_cost "BH Electricity Cost"
	label var bh_annu_inputs_cost "BH Inputs Cost"
	label var treat "ETS Treatment=1"

//	Generate key aggregate plant-level variables --> these are for regression controls	
	
	* Total Revenue
		rename grossrev_17_18_clean plant_rev
		replace plant_rev = plant_rev*100 // convert from crore to lakhs
		label var plant_rev "Total Revenue 2017 (INR Lakhs) - Baseline"

	* Capital 
		* set missing values to 0
		foreach var of varlist bh_total_smallrepair bh_annu_ope_cost_lakh bh_annu_maint_cost_lakh {
			replace `var' = 0 if `var' == .
		}
		* create aggregate variable
		gen working_capital = bh_total_smallrepair + bh_annu_ope_cost_lakh + bh_annu_maint_cost_lakh 
		label var working_capital  "Working Capital (INR Lakhs) - Baseline"
		* replace 0 values with missing again
		foreach var of varlist bh_total_smallrepair bh_annu_ope_cost_lakh bh_annu_maint_cost_lakh {
			replace `var' = . if `var' == 0
		}
	
		* Total (= working capital, bc no fixed capital in baseline)
		gen plant_cost_capital = working_capital
		label var plant_cost_capital  "Total Capital (INR Lakhs) - Baseline"
		
	* Labor Costs
		gen plant_cost_labor = bh_annu_labor_cost // we use bh as control
		label var plant_cost_labor  "Total Labor Cost (INR Lakhs) - Baseline"
		
	* Electricity Costs
		gen plant_cost_elec = elecbill_2017
		label var plant_cost_elec  "Total Electricity Cost (INR Lakhs) - Baseline"
		
	* Fuel Costs (= pulled from phone survey)
		
	* Material Costs
		gen plant_cost_material = (annu_work_days * bh_water_cost_perday + 12 * bh_month_chem_cost)/100000
		label var plant_cost_material  "Total Material Cost (INR Lakhs) - Baseline"
	
//	Rename relevant BH costs --> these are for regression controls

	gen bh_cost_capital = plant_cost_capital
	label var bh_cost_capital  "BH Capital (INR Lakhs) - Baseline"
	
	gen bh_cost_labor = bh_annu_labor_cost
	label var bh_cost_labor  "BH Labor Cost (INR Lakhs) - Baseline"
	
	gen bh_cost_elec = bh_annu_elec_cost
	label var bh_cost_elec  "BH Electricity Cost (INR Lakhs) - Baseline"
	
	** fuel from phone survey
	
	gen bh_cost_material = bh_annu_inputs_cost
	label var bh_cost_material "BH Material Cost (INR Lakhs) - Baseline"	
	
	
//	Keep relevant variables, rename to signal baseline
	rename D_treatment treat
	label var treat "ETS Treatment=1"
	keep industry_id treat plant_cost_* bh_cost_* ///
		num_cyclones num_bagfilters num_scrubbers num_esps D_cyc D_bf D_scr D_esp ///
		plant_boi_cap
		
	ds industry_id plant_boi_cap treat, not
	foreach var of varlist `r(varlist)' {
		rename `var' `var'_bl
	}
	
*****************************************
**************** Merge ******************
*****************************************

	sort industry_id
	merge 1:1 industry_id using `data'
	assert _m!=2
	
	gen d_onlybl = (_m==1)
	quietly sum d_onlybl
	assert `r(sum)'==23
	drop _m
	
*****************************************
****** Additional Data Processing  ******
*****************************************

// Top code material costs at 99th percentile within treatment arm, due to outlier

ds *cost_material_*
foreach var of varlist `r(varlist)' {
	foreach status in 0 1 {
		quietly: sum `var' if treat==`status', d
		replace `var' = `r(p99)' if `var' > `r(p99)' & treat==`status'
	}
}

// 	Convert number of Abatement Devices to costs
	** costs pulled from this spreadsheet: https://docs.google.com/spreadsheets/d/1UNsdN85sn891vtabzV8pF1T76U5445cs5ZPWwV2OPDE/edit#gid=0
	** (a) Why do 14 plants have 0 boiler cap? --> powered by TPH and HAGs 
	** (b) Why does one plant (GJS_334) have 183 boiler cap? --> accurate, it's just a large plant
	** follow-up question: how to impute costs for these cases?  We set 0 to missing, and group 183 into the largest bucket.
	
	* create an indicator for boiler cap thresholds per spreadsheet
	gen plant_boi_cap_size = 0 
	replace plant_boi_cap_size = 1 if plant_boi_cap >= 0 & plant_boi_cap < 4 // spreadsheet said lower bound is 1, but we have 0s
	replace plant_boi_cap_size = 2 if plant_boi_cap >= 4 & plant_boi_cap < 8
	replace plant_boi_cap_size = 3 if plant_boi_cap >= 8 & plant_boi_cap < 15
	replace plant_boi_cap_size = 4 if plant_boi_cap >= 15 & plant_boi_cap < 25
	replace plant_boi_cap_size = 5 if plant_boi_cap >= 25 & plant_boi_cap < 185 // spreadsheet said upper bound is 90, but we have one 183 value
	
	* spreadsheet implies large boilers should have no bagfilters and no scrubbers
	ds num_bagfilters_* num_scrubbers_*
	foreach var in `r(varlist)' {
		assert `var'==0 if plant_boi_cap_size==5
	}
	
	* import spreadsheet cost matrix (unit: INR Lakhs)
	local cost_cyclones 4 5.5 6.5 8 8
	local cost_bagfilters 3.75 9 11.5 12 0 
	local cost_scrubbers 6 9.5 15 15 0
	local cost_esps 45 60 100 135 225
	
	* impute costs based on cost matrix
	foreach apcd in cyclones bagfilters scrubbers esps {
		
		* initialize cost variables
		gen `apcd'_unit_cost = 0
		
		* loop over each apcd and all five costs
		local i = 1
		foreach val in `cost_`apcd'' {
			replace `apcd'_unit_cost = `val' if plant_boi_cap_size==`i' 
			local i = `i'+1
		}
		
	}
	drop plant_boi_cap plant_boi_cap_size
	
	* create total apcd cost per plant
	foreach apcd in cyclones bagfilters scrubbers esps {
		gen `apcd'_total_cost_el = num_`apcd'_el * `apcd'_unit_cost
		gen `apcd'_total_cost_bl = num_`apcd'_bl * `apcd'_unit_cost
	}
	
	label var cyclones_unit_cost "Cyclones Unit Cost (INR Lakhs)"
	label var bagfilters_unit_cost "Bag Filters Unit Cost (INR Lakhs)"
	label var scrubbers_unit_cost "Scrubbers Unit Cost (INR Lakhs)"
	label var esps_unit_cost "ESPs Unit Cost (INR Lakhs)"
	label var cyclones_total_cost_el "Cyclones Total Cost (INR Lakhs)"
	label var cyclones_total_cost_bl "Cyclones Total Cost (INR Lakhs) - Baseline"
	label var bagfilters_total_cost_el "Bag Filters Total Cost (INR Lakhs)"
	label var bagfilters_total_cost_bl "Bag Filters Total Cost (INR Lakhs) - Baseline"
	label var scrubbers_total_cost_el "Scrubbers Total Cost (INR Lakhs)"
	label var scrubbers_total_cost_bl "Scrubbers Total Cost (INR Lakhs) - Baseline"
	label var esps_total_cost_el "ESPs Total Cost (INR Lakhs)"
	label var esps_total_cost_bl "ESPs Total Cost (INR Lakhs) - Baseline"
	
//	Turn all costs into USD thousands 
	ds *_cost_* *_cost
	foreach var in `r(varlist)' {
		replace `var' = `var'*$USD2INR/100
	}

//	Only keep plants in endline
	drop if d_onlybl==1
	drop d_onlybl	

//	Save to temp file
	tempfile data
	save `data', replace
	
*****************************************
******* Filter to Analysis Panel  *******
*****************************************

use "$EMISSIONS_DATA_OUT/RuleA_Panel.dta", clear
keep industry_id
quietly bysort industry_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup

* only matched with 283 of the 292 analysis plants
merge 1:1 industry_id using `data', keep(match) nogenerate

save "$PHONE_DATA_OUT/PhoneBaseline.dta", replace
